{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# GroupBy技术\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import DataFrame, Series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      data1     data2 key1 key2\n",
      "0 -0.391481  0.411457    a  one\n",
      "1 -0.212856 -0.161720    a  two\n",
      "2  0.132626 -0.540404    b  one\n",
      "3  0.460141 -0.059319    b  two\n",
      "4 -1.968553 -0.752753    a  one\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "key1\n",
       "a   -0.857630\n",
       "b    0.296383\n",
       "Name: data1, dtype: float64"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],\n",
    "                'key2' : ['one', 'two', 'one', 'two', 'one'],\n",
    "                'data1' : np.random.randn(5),\n",
    "                'data2' : np.random.randn(5)})\n",
    "grouped = df['data1'].groupby(df['key1']) # 根据key1的值分组\n",
    "print(df)\n",
    "grouped.mean() # 对分组后数字型的列求平均值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "key1  key2\n",
       "a     one    -1.180017\n",
       "      two    -0.212856\n",
       "b     one     0.132626\n",
       "      two     0.460141\n",
       "Name: data1, dtype: float64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "means = df['data1'].groupby([df['key1'], df['key2']]).mean() # 根据key1/2分组,产生多重索引\n",
    "means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>key2</th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>key1</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>-1.180017</td>\n",
       "      <td>-0.212856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>0.132626</td>\n",
       "      <td>0.460141</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "key2       one       two\n",
       "key1                    \n",
       "a    -1.180017 -0.212856\n",
       "b     0.132626  0.460141"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "means.unstack() # 把内层索引变成列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "California  2005   -0.212856\n",
       "            2006    0.132626\n",
       "Ohio        2005    0.034330\n",
       "            2006   -1.968553\n",
       "Name: data1, dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])\n",
    "years = np.array([2005, 2005, 2006, 2005, 2006])\n",
    "df['data1'].groupby([states, years]).mean()\n",
    "# 使用group后，原始数据可以认为变为如下形式：\n",
    "#   data1      data2      key1  key2  states      years\n",
    "# 0 -0.127927   0.026962  a     one   Ohio        2005\n",
    "# 1 -1.424594  -0.800712  a     two   California  2005\n",
    "# 2  1.619073  -0.165311  b     one   California  2006\n",
    "# 3 -0.996192  -0.367086  b     two   Ohio        2005\n",
    "# 4  0.020317  -1.238209  a     one   Ohio        2006"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data1</th>\n",
       "      <th>data2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>key1</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>-0.857630</td>\n",
       "      <td>-0.167672</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>0.296383</td>\n",
       "      <td>-0.299861</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         data1     data2\n",
       "key1                    \n",
       "a    -0.857630 -0.167672\n",
       "b     0.296383 -0.299861"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby('key1').mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>data1</th>\n",
       "      <th>data2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>key1</th>\n",
       "      <th>key2</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">a</th>\n",
       "      <th>one</th>\n",
       "      <td>-1.180017</td>\n",
       "      <td>-0.170648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>two</th>\n",
       "      <td>-0.212856</td>\n",
       "      <td>-0.161720</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">b</th>\n",
       "      <th>one</th>\n",
       "      <td>0.132626</td>\n",
       "      <td>-0.540404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>two</th>\n",
       "      <td>0.460141</td>\n",
       "      <td>-0.059319</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              data1     data2\n",
       "key1 key2                    \n",
       "a    one  -1.180017 -0.170648\n",
       "     two  -0.212856 -0.161720\n",
       "b    one   0.132626 -0.540404\n",
       "     two   0.460141 -0.059319"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby(['key1', 'key2']).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "key1  key2\n",
       "a     one     2\n",
       "      two     1\n",
       "b     one     1\n",
       "      two     1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby(['key1', 'key2']).size() # 统计记录条数，类似SQL的group by然后再count。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 对分组进行迭代"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a\n",
      "---\n",
      "      data1     data2 key1 key2\n",
      "0 -0.391481  0.411457    a  one\n",
      "1 -0.212856 -0.161720    a  two\n",
      "4 -1.968553 -0.752753    a  one\n",
      "***\n",
      "b\n",
      "---\n",
      "      data1     data2 key1 key2\n",
      "2  0.132626 -0.540404    b  one\n",
      "3  0.460141 -0.059319    b  two\n",
      "***\n"
     ]
    }
   ],
   "source": [
    "for name, group in df.groupby('key1'): # 单列分组\n",
    "    print(name)\n",
    "    print('---')\n",
    "    print(group)\n",
    "    print('***')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a one\n",
      "---\n",
      "      data1     data2 key1 key2\n",
      "0 -0.391481  0.411457    a  one\n",
      "4 -1.968553 -0.752753    a  one\n",
      "***\n",
      "a two\n",
      "---\n",
      "      data1    data2 key1 key2\n",
      "1 -0.212856 -0.16172    a  two\n",
      "***\n",
      "b one\n",
      "---\n",
      "      data1     data2 key1 key2\n",
      "2  0.132626 -0.540404    b  one\n",
      "***\n",
      "b two\n",
      "---\n",
      "      data1     data2 key1 key2\n",
      "3  0.460141 -0.059319    b  two\n",
      "***\n"
     ]
    }
   ],
   "source": [
    "for (k1, k2), group in df.groupby(['key1', 'key2']): # 多列分组\n",
    "    print(k1, k2)\n",
    "    print('---')\n",
    "    print(group)\n",
    "    print('***')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a\n",
      "---\n",
      "      data1     data2 key1 key2\n",
      "0 -0.391481  0.411457    a  one\n",
      "1 -0.212856 -0.161720    a  two\n",
      "4 -1.968553 -0.752753    a  one\n",
      "***\n",
      "b\n",
      "---\n",
      "      data1     data2 key1 key2\n",
      "2  0.132626 -0.540404    b  one\n",
      "3  0.460141 -0.059319    b  two\n",
      "***\n"
     ]
    }
   ],
   "source": [
    "pieces = dict(list(df.groupby('key1')))\n",
    "for k, v in pieces.items():\n",
    "    print(k)\n",
    "    print('---')\n",
    "    print(v)\n",
    "    print('***')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "float64\n",
      "---\n",
      "      data1     data2\n",
      "0 -0.391481  0.411457\n",
      "1 -0.212856 -0.161720\n",
      "2  0.132626 -0.540404\n",
      "3  0.460141 -0.059319\n",
      "4 -1.968553 -0.752753\n",
      "***\n",
      "object\n",
      "---\n",
      "  key1 key2\n",
      "0    a  one\n",
      "1    a  two\n",
      "2    b  one\n",
      "3    b  two\n",
      "4    a  one\n",
      "***\n"
     ]
    }
   ],
   "source": [
    "grouped = df.groupby(df.dtypes, axis=1) # 默认根据列上的值做分组，axis=1使用行上的类型做分组。\n",
    "for k, v in dict(list(grouped)).items():\n",
    "    print(k)\n",
    "    print('---')\n",
    "    print(v)\n",
    "    print('***')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 选取一个或一组列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<pandas.core.groupby.SeriesGroupBy object at 0x00000202F1D641D0>\n",
      "<pandas.core.groupby.DataFrameGroupBy object at 0x00000202F1D642B0>\n"
     ]
    }
   ],
   "source": [
    "print(df.groupby('key1')['data1']) # 等价df['data1'].groupby(df['key1'])\n",
    "print(df.groupby('key1')[['data2']]) # df[['data2']].groupby(df['key1'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>data2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>key1</th>\n",
       "      <th>key2</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">a</th>\n",
       "      <th>one</th>\n",
       "      <td>-0.170648</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>two</th>\n",
       "      <td>-0.161720</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">b</th>\n",
       "      <th>one</th>\n",
       "      <td>-0.540404</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>two</th>\n",
       "      <td>-0.059319</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              data2\n",
       "key1 key2          \n",
       "a    one  -0.170648\n",
       "     two  -0.161720\n",
       "b    one  -0.540404\n",
       "     two  -0.059319"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby(['key1', 'key2'])[['data2']].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "key1  key2\n",
       "a     one    -0.170648\n",
       "      two    -0.161720\n",
       "b     one    -0.540404\n",
       "      two    -0.059319\n",
       "Name: data2, dtype: float64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_grouped = df.groupby(['key1', 'key2'])['data2'] # 这里用'data2'而不是['data2']返回Series\n",
    "s_grouped.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 通过字典或Series进行分组"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Joe</th>\n",
       "      <td>-0.439304</td>\n",
       "      <td>1.368053</td>\n",
       "      <td>0.088032</td>\n",
       "      <td>0.411676</td>\n",
       "      <td>1.506363</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Steve</th>\n",
       "      <td>0.354057</td>\n",
       "      <td>-0.473297</td>\n",
       "      <td>0.518869</td>\n",
       "      <td>0.274243</td>\n",
       "      <td>0.556453</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Wes</th>\n",
       "      <td>2.739920</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.815980</td>\n",
       "      <td>1.181493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Jim</th>\n",
       "      <td>-1.903019</td>\n",
       "      <td>-0.244090</td>\n",
       "      <td>1.958362</td>\n",
       "      <td>-1.311261</td>\n",
       "      <td>-0.153897</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Travis</th>\n",
       "      <td>-0.228594</td>\n",
       "      <td>0.121615</td>\n",
       "      <td>0.855896</td>\n",
       "      <td>0.663112</td>\n",
       "      <td>0.183528</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               a         b         c         d         e\n",
       "Joe    -0.439304  1.368053  0.088032  0.411676  1.506363\n",
       "Steve   0.354057 -0.473297  0.518869  0.274243  0.556453\n",
       "Wes     2.739920       NaN       NaN  0.815980  1.181493\n",
       "Jim    -1.903019 -0.244090  1.958362 -1.311261 -0.153897\n",
       "Travis -0.228594  0.121615  0.855896  0.663112  0.183528"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "people = DataFrame(np.random.randn(5, 5),\n",
    "                   columns=['a', 'b', 'c', 'd', 'e'],\n",
    "                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])\n",
    "people.loc[2:3, ['b', 'c']] = np.nan # 添加空值\n",
    "people"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>blue</th>\n",
       "      <th>red</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Joe</th>\n",
       "      <td>0.499707</td>\n",
       "      <td>2.435112</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Steve</th>\n",
       "      <td>0.793112</td>\n",
       "      <td>0.437214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Wes</th>\n",
       "      <td>0.815980</td>\n",
       "      <td>3.921412</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Jim</th>\n",
       "      <td>0.647101</td>\n",
       "      <td>-2.301006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Travis</th>\n",
       "      <td>1.519009</td>\n",
       "      <td>0.076549</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            blue       red\n",
       "Joe     0.499707  2.435112\n",
       "Steve   0.793112  0.437214\n",
       "Wes     0.815980  3.921412\n",
       "Jim     0.647101 -2.301006\n",
       "Travis  1.519009  0.076549"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "by_column = people.groupby(mapping, axis=1) # 每一行根据a/b/c/d/e对应的颜色求sum\n",
    "by_column.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>blue</th>\n",
       "      <th>red</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Joe</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Steve</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Wes</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Jim</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Travis</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        blue  red\n",
       "Joe        2    3\n",
       "Steve      2    3\n",
       "Wes        1    2\n",
       "Jim        2    3\n",
       "Travis     2    3"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "map_series = Series(mapping)\n",
    "people.groupby(map_series, axis=1).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 通过函数进行分组"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.397596</td>\n",
       "      <td>1.123963</td>\n",
       "      <td>2.046394</td>\n",
       "      <td>-0.083606</td>\n",
       "      <td>2.533959</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.354057</td>\n",
       "      <td>-0.473297</td>\n",
       "      <td>0.518869</td>\n",
       "      <td>0.274243</td>\n",
       "      <td>0.556453</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>-0.228594</td>\n",
       "      <td>0.121615</td>\n",
       "      <td>0.855896</td>\n",
       "      <td>0.663112</td>\n",
       "      <td>0.183528</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          a         b         c         d         e\n",
       "3  0.397596  1.123963  2.046394 -0.083606  2.533959\n",
       "5  0.354057 -0.473297  0.518869  0.274243  0.556453\n",
       "6 -0.228594  0.121615  0.855896  0.663112  0.183528"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "people.groupby(len).sum() # 根据索引名字的长度做group，然后求sum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">3</th>\n",
       "      <th>one</th>\n",
       "      <td>-0.439304</td>\n",
       "      <td>1.368053</td>\n",
       "      <td>0.088032</td>\n",
       "      <td>0.411676</td>\n",
       "      <td>1.181493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>two</th>\n",
       "      <td>-1.903019</td>\n",
       "      <td>-0.244090</td>\n",
       "      <td>1.958362</td>\n",
       "      <td>-1.311261</td>\n",
       "      <td>-0.153897</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <th>one</th>\n",
       "      <td>0.354057</td>\n",
       "      <td>-0.473297</td>\n",
       "      <td>0.518869</td>\n",
       "      <td>0.274243</td>\n",
       "      <td>0.556453</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <th>two</th>\n",
       "      <td>-0.228594</td>\n",
       "      <td>0.121615</td>\n",
       "      <td>0.855896</td>\n",
       "      <td>0.663112</td>\n",
       "      <td>0.183528</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              a         b         c         d         e\n",
       "3 one -0.439304  1.368053  0.088032  0.411676  1.181493\n",
       "  two -1.903019 -0.244090  1.958362 -1.311261 -0.153897\n",
       "5 one  0.354057 -0.473297  0.518869  0.274243  0.556453\n",
       "6 two -0.228594  0.121615  0.855896  0.663112  0.183528"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "key_list = ['one', 'one', 'one', 'two', 'two']\n",
    "people.groupby([len, key_list]).min()\n",
    "# key_list等于新加一列，结合len，形状如下：\n",
    "#                  a         b         c         d         e\n",
    "# Joe(3)     one,  0.254889 -0.812035  2.765460  1.113513  0.646795\n",
    "# Steve(5)   one,  1.507490  0.463545 -1.396887  0.728163  1.078788\n",
    "# Wes(3)     one, -2.099479       NaN       NaN  0.438572  0.134136\n",
    "# Jim(3)     two,  0.167685  1.772127  1.372546 -0.758560 -1.241066\n",
    "# Travis(6)  two, -0.834662 -0.933228  1.026441 -0.074524 -0.830303"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 根据索引级别分组"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>cty</th>\n",
       "      <th colspan=\"3\" halign=\"left\">US</th>\n",
       "      <th colspan=\"2\" halign=\"left\">JP</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tenor</th>\n",
       "      <th>1</th>\n",
       "      <th>3</th>\n",
       "      <th>5</th>\n",
       "      <th>1</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.616084</td>\n",
       "      <td>-0.572021</td>\n",
       "      <td>-0.325839</td>\n",
       "      <td>-1.561540</td>\n",
       "      <td>-0.136162</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.785219</td>\n",
       "      <td>-0.711985</td>\n",
       "      <td>-1.047321</td>\n",
       "      <td>1.197329</td>\n",
       "      <td>-0.433373</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.613726</td>\n",
       "      <td>-0.548456</td>\n",
       "      <td>-1.568695</td>\n",
       "      <td>-0.743757</td>\n",
       "      <td>1.479070</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.018869</td>\n",
       "      <td>-0.353071</td>\n",
       "      <td>1.056401</td>\n",
       "      <td>-0.513008</td>\n",
       "      <td>0.291997</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "cty          US                            JP          \n",
       "tenor         1         3         5         1         3\n",
       "0      0.616084 -0.572021 -0.325839 -1.561540 -0.136162\n",
       "1     -0.785219 -0.711985 -1.047321  1.197329 -0.433373\n",
       "2     -0.613726 -0.548456 -1.568695 -0.743757  1.479070\n",
       "3      0.018869 -0.353071  1.056401 -0.513008  0.291997"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],\n",
    "                                     [1, 3, 5, 1, 3]],\n",
    "                                    names=['cty', 'tenor'])\n",
    "hier_df = DataFrame(np.random.randn(4, 5), columns=columns)\n",
    "hier_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>cty</th>\n",
       "      <th>JP</th>\n",
       "      <th>US</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "cty  JP  US\n",
       "0     2   3\n",
       "1     2   3\n",
       "2     2   3\n",
       "3     2   3"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hier_df.groupby(level='cty', axis=1).count()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
